import re
import pickle
import numpy as np
import pandas as pd
import os
import time
import sys

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import word_split

# Evaluate a model's predictions
def evaluate(Y_test, Y_pred, debug=True):
    """Compute classification metrics, append them to a log file and return them.

    Args:
        Y_test: Ground-truth labels.
        Y_pred: Predicted labels, aligned with ``Y_test``.
        debug: When true, also print every metric to stdout.

    Returns:
        dict with per-class ``precision`` / ``recall`` / ``f1-score``
        (``average=None``), ``f1-score(micro)``, ``f1-score(macro)``,
        ``accuracy`` and ``confusion-matrix``.  When ``Y_test`` contains
        exactly two distinct labels, ``pre`` and ``rec`` are added and
        ``f1-score`` is replaced by the single default-average F1 value.

    Side effects:
        Appends a timestamped dump of the metrics to
        ``data/svm/evaluate.txt`` (the directory must already exist).
    """
    # Per-class precision, recall and F1, plus aggregate F1, accuracy
    # and the confusion matrix.
    result = {
        "precision": precision_score(Y_test, Y_pred, average=None),
        "recall": recall_score(Y_test, Y_pred, average=None),
        "f1-score": f1_score(Y_test, Y_pred, average=None),
        "f1-score(micro)": f1_score(Y_test, Y_pred, average="micro"),
        "f1-score(macro)": f1_score(Y_test, Y_pred, average="macro"),
        "accuracy": accuracy_score(Y_test, Y_pred),
        "confusion-matrix": confusion_matrix(Y_test, Y_pred),
    }
    # Binary case: report single precision / recall / F1 values as well.
    # Note that this overwrites the per-class "f1-score" array set above.
    if len(set(Y_test)) == 2:
        result["pre"] = precision_score(Y_test, Y_pred)
        result["rec"] = recall_score(Y_test, Y_pred)
        result["f1-score"] = f1_score(Y_test, Y_pred)

    # Persist the metrics; the context manager closes the file even if a
    # write fails part-way through.
    with open("data/svm/evaluate.txt", "a+") as log_file:
        log_file.write("\n" + time.asctime() + "\n")
        for key, value in result.items():
            log_file.write("{0}:{1}\n".format(key, value))

    if debug:  # echo the saved metrics to stdout
        for key, value in result.items():
            if key == "confusion-matrix":
                print(key, ":\n", value)
            else:
                print(key, ":", value)
    return result

# Train the model
def train(data, feats, label, mode=None, args=None, train_ratio=0.7, random_state=313, sample_ratio=None) -> object:
    """Split the data, rebalance the training part with SMOTE and fit a
    gradient-boosting classifier.

    Args:
        data: Not used by this function; kept for call compatibility.
        feats: Feature matrix, indexable by a list of row indices and
            exposing ``shape[0]``.
        label: Iterable of labels convertible with ``int()``.
        mode: ``None`` for multi-class training.  Otherwise the class
            (after ``% N_Class``) that is mapped to 1 while every other
            class is mapped to 0 (binary training).
        args: ``[n_estimators, subsample, max_depth, min_samples_split,
            min_samples_leaf]`` for ``GradientBoostingClassifier``.
            Defaults to ``[100, 1.0, 3, 2, 2]``.
        train_ratio: Fraction of rows drawn (without replacement) for the
            training split; the remaining rows form the test split.
        random_state: Seed passed to ``np.random.seed`` before the split.
        sample_ratio: ``sampling_strategy`` handed to SMOTE.  Defaults to
            a fixed per-class target dict for labels 0..5.

    Returns:
        The fitted ``GradientBoostingClassifier``.

    Side effects:
        Prints split statistics and a one-line score summary, and calls
        ``evaluate`` twice (training split, then test split).
    """
    # NOTE(review): N_Class is not defined in the visible part of this file;
    # confirm where it is supposed to come from before relying on this code.
    if mode is not None:  # one-vs-rest: the chosen class becomes 1, all others 0
        label_ = [1 if int(e) % N_Class == int(mode) else 0 for e in label]
    else:  # multi-class
        label_ = [int(e) % N_Class for e in label]
    X = feats
    Y = np.array(label_)
    total_size = X.shape[0]
    train_size = int(total_size * train_ratio)
    # Seeds NumPy's *global* generator.  This is kept global on purpose: the
    # classifier below is created without its own random_state.
    # NOTE(review): presumably it then draws from this global state; confirm.
    np.random.seed(random_state)
    # Randomly pick train_size rows for training; the rest are the test split.
    train_ids = np.random.choice(np.arange(total_size), train_size, replace=False).tolist()
    test_ids = list(set(np.arange(total_size)) - set(train_ids))
    X_train, Y_train = X[train_ids], Y[train_ids]
    X_test, Y_test = X[test_ids], Y[test_ids]

    # Multi-class only: histogram of how many positive features each test row
    # has, with every count of 10 or more pooled into the bucket 10.
    if mode is None:
        nonzero_hist = {}
        for ii in test_ids:
            cnt = min(int(np.sum(X[ii] > 0)), 10)
            nonzero_hist[cnt] = nonzero_hist.get(cnt, 0) + 1
        print(nonzero_hist)

    print(np.bincount(Y_train))  # class counts of the training split before resampling
    if sample_ratio is None:  # default per-class resampling targets
        # sample_ratio = "auto"
        # NOTE(review): these keys assume labels 0..5 are all present; in
        # binary mode only 0/1 exist, so pass sample_ratio explicitly there.
        sample_ratio = {0: 500, 1: 500, 2: 500, 3: 500, 4: 500, 5: 200}
    # Rebalance the training split with SMOTE.  fit_resample replaces the
    # removed fit_sample alias and exists in every imbalanced-learn release
    # that accepts the sampling_strategy argument.
    X_train, Y_train = SMOTE(sampling_strategy=sample_ratio, random_state=313).fit_resample(X_train, Y_train)

    if args is None:  # default model hyper-parameters
        args = [100, 1.0, 3, 2, 2]
    # Fit the gradient-boosting classifier.
    clf = GradientBoostingClassifier(n_estimators=args[0], subsample=args[1], max_depth=args[2],
                                     min_samples_split=args[3], min_samples_leaf=args[4])
    clf.fit(X_train, Y_train)
    print(np.bincount(Y_train))  # class counts after resampling
    Y_valid = clf.predict(X_train)  # predictions on the (resampled) training split
    Y_pred = clf.predict(X_test)  # predictions on the held-out test split
    # Score both splits; evaluate() also appends each result to its log file.
    res1 = evaluate(Y_train, Y_valid, debug=False)
    res2 = evaluate(Y_test, Y_pred, debug=False)
    print("Label Type = %s , f1_train = %s (acc=%s), f1_test = %s (acc=%s)" % (mode, res1["f1-score"], res1["accuracy"], res2["f1-score"], res2["accuracy"]))

    return clf  # clf.predict_proba(X)

# Write the grouped counts to a file
def write_output_file(output_file, counter_dict):
    """Write per-identifier, per-period class counts as a comma-separated table.

    Args:
        output_file: Path of the file to create or overwrite.
        counter_dict: ``{identifier: {period: counts}}`` where ``period`` is
            a string key (a year or year+month) and ``counts`` is a sequence
            whose elements from index 1 onward are written as r1..r5.  The
            element at index 0 is neither written nor added to the total.

    Layout:
        Row 1 holds the sorted period keys, each followed by blank cells,
        then ``total``.  Row 2 repeats ``r1..r5`` under every period.  Each
        later row is one identifier: its counts per period (five zeros when
        that period has no record) and the sum of all written counts.
    """
    # Collect every period key that occurs under any identifier.
    periods = set()
    for per_period in counter_dict.values():
        periods.update(per_period.keys())
    time_list = sorted(periods)

    # The context manager guarantees the handle is closed even if a row
    # fails to serialise.
    with open(output_file, "w+") as f:
        f.write("标识码," + ",,,,,".join(time_list) + ",,,,,total\n")
        f.write(",r1,r2,r3,r4,r5" * len(time_list) + ",\n")
        for ident, per_period in counter_dict.items():  # one row per identifier
            row = [str(ident)]
            total = 0
            for ts in time_list:
                if ts not in per_period:
                    row.append("0,0,0,0,0")  # no record for this period: five zeros
                else:
                    counts = per_period[ts][1:]
                    row.append(",".join(map(str, counts)))  # per-class counts
                    total += np.sum(counts)
            f.write(",".join(row) + "," + str(int(total)) + "\n")



def _parse_csv_row(line, time_pattern, id_pattern):
    """Return ``(content, web_id, year_key, month_key)`` for a usable row, else ``None``."""
    # Drop tabs and quote characters before splitting on commas.
    line = line.replace("\t", "").replace("'", "").replace('"', "")
    # Strip only the newline so a final line without one keeps its last character.
    fields = line.rstrip("\n").split(",")
    if len(fields) < 4:  # blank or truncated row
        return None
    content, time_stamp, _web_name, web_id = fields[:4]
    # Reject rows whose timestamp or ID does not have the expected format.
    if not time_pattern.match(time_stamp) or not id_pattern.match(web_id):
        return None
    if len(content) == 0:  # empty text
        return None
    content = re.sub(r'\s+', ' ', content.strip())  # trim and collapse whitespace
    # Length check ignores punctuation; very short texts are treated as incomplete.
    if len(re.sub(r'\W+', '', content)) <= 4:
        return None
    return content, web_id, time_stamp[:4], time_stamp[:4] + time_stamp[5:7]


def predict_csv_files(input_dir, vecobj, clf, output_file):
    """Classify every usable row of the CSV files under ``input_dir`` and
    tally the predicted labels per ID, per year and per month.

    Args:
        input_dir: Directory to scan.  Its ``.csv`` files and the ``.csv``
            files one level down inside its sub-directories are processed.
        vecobj: Vectorizer object passed to ``word_split.get_tfidf_feature``.
        clf: Fitted classifier exposing ``predict``; predicted labels are
            used as indices into a six-slot counter list.
        output_file: Currently unused; the output paths are hard-coded.

    Side effects:
        Writes one predicted label per line to
        ``data/kaifaqu/merge_clean.txt`` and the yearly / monthly tables to
        ``data/kaifaqu/merge_clean_year.csv`` and
        ``data/kaifaqu/merge_clean_month.csv``; prints progress and timing.
    """
    st = time.time()
    counter_dict = dict()  # counts grouped by ID and year
    counter_dict_month = dict()  # counts grouped by ID and year+month
    cnt_line = 0  # number of rows accepted for classification
    FORMAT_1 = re.compile(r'\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2}$')  # timestamp pattern
    FORMAT_2 = re.compile(r'\w*\d+$')  # ID pattern
    # f_out = open(output_file,"w+")
    with open("data/kaifaqu/merge_clean" + ".txt", "w+") as f_out:
        for folder in os.listdir(input_dir):
            entry = os.path.join(input_dir, folder)
            # Gather the candidate data files: a directory contributes its
            # direct children, anything else is treated as a single file.
            if os.path.isdir(entry):
                print("Now at folder:", folder)
                file_list = [os.path.join(entry, item) for item in os.listdir(entry)]
            else:
                file_list = [entry]
            for file in file_list:
                if not file.endswith(".csv"):  # skip anything that is not a CSV
                    continue
                data, info = [], []
                with open(file, "r", encoding="utf-8") as f:
                    print(file)
                    for line in f:
                        row = _parse_csv_row(line, FORMAT_1, FORMAT_2)
                        if row is None:
                            continue
                        content, name, key1, key2 = row
                        # NOTE(review): ADD_BAN_ID is not defined in the visible
                        # part of this file; confirm where it comes from.
                        if name in ADD_BAN_ID:
                            continue
                        words = word_split.preprocess(content, True)  # tokenise and preprocess
                        data.append(words)
                        info.append([name, key1, key2])
                        cnt_line += 1
                        # Periodic progress report.
                        if cnt_line % 1000 == 0:
                            print("机器正在分类中，目前已分好%d条" % cnt_line)
                if not data:  # nothing usable in this file; do not call predict on zero rows
                    continue
                feats = word_split.get_tfidf_feature(data, vecobj)  # TF-IDF features
                label = clf.predict(feats)  # predicted labels
                # Tally each prediction under its ID for both groupings.
                for (name, key1, key2), predicted in zip(info, label):
                    by_year = counter_dict.setdefault(name, dict())
                    by_month = counter_dict_month.setdefault(name, dict())
                    by_year.setdefault(key1, [0] * 6)[predicted] += 1
                    by_month.setdefault(key2, [0] * 6)[predicted] += 1
                    f_out.write(str(predicted) + "\n")
    ed = time.time()
    print("Converting Time %.4lfs" % float(ed - st))  # elapsed time
    print("Total Lines: ", cnt_line)
    # Save the per-year and per-month class counts.
    write_output_file("data/kaifaqu/merge_clean" + "_year.csv", counter_dict)
    write_output_file("data/kaifaqu/merge_clean" + "_month.csv", counter_dict_month)
    return

# Obtain the TF-IDF vectorizer
def load_vec(min_df, filename, idx=2):
    """Return a vectorizer, either loaded from disk or rebuilt from a TSV file.

    Args:
        min_df: Forwarded to ``word_split.get_vectorizer`` when rebuilding.
        filename: TSV file to build from; ``None`` loads the previously
            saved ``vectorizer.obj`` instead.
        idx: Second positional argument of ``word_split.load_tsv``
            (presumably the text column index; verify against that helper).

    Returns:
        The vectorizer object.  A rebuilt one is also saved to
        ``vectorizer.obj``.
    """
    # No source file given: reuse the vectorizer that was saved earlier.
    if filename is None:
        return word_split.load_object("vectorizer.obj")

    # NOTE(review): presumably a warm-up call for the tokenizer; confirm.
    word_split.preprocess("Hello World!", True)
    stops = word_split.load_stop_words()  # "stop_words_all.txt") # stop words
    corpus, _ = word_split.load_tsv(filename, idx, None)
    vectorizer = word_split.get_vectorizer(corpus, stops, min_df=min_df)  # build the vocabulary
    word_split.save_object(vectorizer, "vectorizer.obj")  # persist for later runs
    return vectorizer

# Load a labelled data set
def load_data(vecobj, filename, idx_data=2, idx_label=1):
    """Load texts and labels from a TSV file and compute their TF-IDF features.

    Args:
        vecobj: Vectorizer object passed to ``word_split.get_tfidf_feature``.
        filename: TSV file handed to ``word_split.load_tsv``.
        idx_data: Forwarded to ``load_tsv`` as ``idx_data``.
        idx_label: Forwarded to ``load_tsv`` as ``idx_label``.

    Returns:
        ``(data, label, tfidf)``: the two values returned by ``load_tsv``
        followed by the feature matrix computed from the first of them.
    """
    texts, labels = word_split.load_tsv(filename, idx_data=idx_data, idx_label=idx_label)
    features = word_split.get_tfidf_feature(texts, vecobj)  # TF-IDF features
    return texts, labels, features

# The three supported run modes
selection = ["classify", "train", "pre-train"]

# Default run mode; the first command-line argument overrides it.
# mode = "pre-train"
mode = "train"
# mode = "classify"

if __name__ == "__main__":
    st = time.time()
    args = sys.argv
    if len(args) >= 2:
        mode = args[1]

    # Fail loudly on a mistyped mode instead of silently doing nothing.
    if mode not in selection:
        sys.exit("Unknown mode %r; expected one of: %s" % (mode, ", ".join(selection)))

    if mode == "classify":  # classify new data (needs the saved vectorizer and classifier)
        vecobj = word_split.load_object("vectorizer.obj")
        clf = word_split.load_object("clf.obj")
        # Ministry data
        # predict_files(["data/yang/csv_merge/Y_merge_" + str(k + 1) + ".csv" for k in range(5)], vecobj, clf, "result_y")
        predict_csv_files("data/kaifaqu/merge_clean/", vecobj, clf, "result_agency")
    # predict_csv_files("./NLP-Government-agency/",vecobj,clf,"result_agency")
    elif mode == "train":  # train the classifier (needs the saved vectorizer)
        vecobj = word_split.load_object("vectorizer.obj")
        data, label, feats = load_data(vecobj, "data/BW/trainmodel/train_data_0220_25.tsv")
        # Earlier hyper-parameter trials, kept as a record:
        # clf = train(data,feats,label,None,[800,0.001,7,5,4],sample_ratio="auto")
        # clf = train(data,feats,label,None,[200,0.005,7,5,6],0.7,258,sample_ratio="auto")
        # clf = train(data,feats,label,None,[500,0.018,7,5,6],0.7,210,sample_ratio="auto")
        # clf = train(data,feats,label,None,[1000,0.02,7,5,6],0.7,233,sample_ratio="auto")
        # clf = train(data,feats,label,None,[100,0.02,6,4,6],0.7,233,{1:500,2:500,3:300,4:1400,5:600,}) #Best
        # clf = train(data,feats,label,None,[100,0.20,6,4,6],0.7,355,{1:200,2:800,3:600,4:1400,5:600,}) #Best Best
        # clf = train(data,feats,label,None,[100,0.30,6,4,6],0.7,355,{1:200,2:800,3:600,4:1400,5:600,}) #Best Best
        # clf = train(data,feats,label,None,[100,0.40,6,4,6],0.7,355,{1:200,2:800,3:600,4:1400,5:600,}) #Best Best
        clf = train(data, feats, label, None, [100, 0.3, 6, 4, 6], 0.7, 313, "auto")  # Best Best
        word_split.save_object(clf, "clf.obj")  # persist the trained classifier
    elif mode == "pre-train":  # build the vectorizer (first step of the pipeline)
        min_df = 0.0001
        vecobj = load_vec(min_df, "data/BW/BW_clean_2.tsv", 1)
        # load_vec already saves the rebuilt vectorizer; this second save is
        # redundant but harmless and kept for compatibility.
        word_split.save_object(vecobj, "vectorizer.obj")

    ed = time.time()
    print("Time Cost:", ed - st)

